{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NCBIblast service from BioServices"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This example illustrate the NCBIBlast module usage within BioServices.\n",
    "\n",
    "<i>\"The emphasis of NCBIBlast is to find regions of sequence similarity, which will yield functional and evolutionary clues about the structure and function of your novel sequence.\" -- from NCBIblast web page.</i>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "from bioservices import *\n",
    "%pylab inline --no-import-all\n",
    "from pylab import hist"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<i>First, we need to get a FASTA sequence, which will be the input to the NCBIblast service. </i>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "u = UniProt()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sequence = u.retrieve(\"P43403\", \"fasta\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sequence alignement can be performed with the blastp tool via NCBIblast service."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "s = NCBIblast(verbose=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "jobid = s.run(program=\"blastp\", sequence=sequence, \n",
    "              stype=\"protein\", database=\"uniprotkb\", \n",
    "              email=\"test@whatever.co.uk\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BLASTP 2.2.29+\n",
      "\n",
      "\n",
      "Reference: Stephen F. Altschul, Thomas L. Madden, Alejandro A.\n",
      "Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J.\n",
      "Lipman (1997), \"Gapped BLAST and PSI-BLAST: a new generation of\n",
      "protein database search programs\", Nucleic Acids Res. 25:3389-3402.\n",
      "\n",
      "\n",
      "Reference for composition-based statistics: Alejandro A. Schaffer,\n",
      "L. Aravind, Thomas L. Madden, Sergei Shavirin, John L. Spouge, Yuri\n",
      "I. Wolf, Eugene V. Koonin, and Stephen F. Altschul (2001),\n",
      "\"Improving the accuracy of PSI-BLAST protein database searches with\n",
      "composition-based statistics and other refinements\", Nucleic Acids\n",
      "Res. 29:2994-3005.\n",
      "\n",
      "\n",
      "\n",
      "Database: uniprotkb\n",
      "           53,372,207 sequences; 17,745,197,851 total letters\n",
      "\n",
      "\n",
      "\n",
      "Query= sp|P43403|ZAP70_HUMAN Tyrosine-protein kinase ZAP-70 OS=Homo sapiens\n",
      "GN=ZAP70 PE=1 SV=1\n",
      "\n",
      "Length=619\n",
      "                                                                      Score     E\n",
      "Sequences producing significant alignments:                          (Bits)  Value\n",
      "\n",
      "lcl|SP:ZAP70_HUMAN  P43403 Tyrosine-protein kinase ZAP-70 OS=Homo...   1279   0.0   \n",
      "lcl|TR:G3QGN8_GORGO  G3QGN8 Tyrosine-protein kinase OS=Gorilla go...   1278   0.0   \n",
      "lcl|TR:H2QIE3_PANTR  H2QIE3 Tyrosine-protei\n"
     ]
    }
   ],
   "source": [
    "print(s.getResult(jobid, \"out\")[0:1200])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lines = s.getResult(jobid, \"out\").split(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "entries = [l for l in lines if l.startswith(\"lcl\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([ 202.,   48.,   12.,   13.,   10.,    3.,    6.,    6.,   80.,\n",
       "          14.,    4.,   19.,    9.,   31.,    8.,    1.,    8.,    5.,\n",
       "          12.,    9.]),\n",
       " array([  205. ,   258.7,   312.4,   366.1,   419.8,   473.5,   527.2,\n",
       "          580.9,   634.6,   688.3,   742. ,   795.7,   849.4,   903.1,\n",
       "          956.8,  1010.5,  1064.2,  1117.9,  1171.6,  1225.3,  1279. ]),\n",
       " <a list of 20 Patch objects>)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD/CAYAAAD/qh1PAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADV5JREFUeJzt3cFuG9cVxvHvqGYSe1NZWndhFnkAqSq6NUJ5EaA7Gc4L\nlH4DB/EqKwM20heolQeIi3rZlU1h1o0DeR+HeQFJZrwwhCjy6YJ37AlFieLwUhQP/z+A8MzhzPAO\nZvzx6g6HNHcXACCupVk3AAAwXQQ9AARH0ANAcAQ9AARH0ANAcCOD3sza6fGwUntUPlepbZlZq1oD\nAMzemUFvZi1Jz919W1IzzUtS28x+lPRTWm5dkty9k+bXptdkAMA4RvXom5I203RX0o003Xb3T919\nJ83fkfS6stymAACXwpWznkw9+dK6pO/S9Erq3a+7+zeSliUdVJZdzdpKAEBtZwZ9KQ3N/ODuL6UP\nbwBmdqsynGPTaSIAYBLnCnpJLXe/L72/AHvg7k8l7as/vNOTtJKWvZ7qv2NmfNcCANTg7hN1pM/z\nqZu7aXimvDjblfQ8Pb0q6XtJT9QPfKk/jv/slMaGfXz99dczbwP7x/4t2r4twv7lMOpTN5uSHprZ\nKzM76Ge1dyRtmtmWpD13f+nuu2n5lqSepyEeAMDsjboY+1wfhmSq9adDauWF206epgEAcuDO2Exu\n3rw56yZMFfs3vyLvmxR//3KwXGNAI1/IzC/qtQAgCjOTT/tiLABgvhH0ABAcQQ8AwRH0ABAcQQ8A\nwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0\nABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAc\nQQ8AwRH0ABAcQQ8AwRH0ABDclVELmFk7Tf7Z3b9KtS1JPUlNd98+rQYAmL0ze/Rm1pL0PAV308xa\nZrYmSe7eScusmdn6YG26zQYAnNeoHn0zPbYlddP0LUnP0vNdSZuSVofUdgc3trOzo3fv3tVu7Gef\nfaalJUabAGAcZwb9wBDMuqQnkv4iab9SX5W0LOlgoHbC55//XZ988jeZ/WHshv7yS0e//vorQQ8A\nYxo5Ri9JaWjmB3ffNTNJsjov5i69efNfSdfGXndpqVHnJQFg4Z0r6CW13P1+mu5JWknTy/rQuy9r\n1/X7Hv97x8dHkh5Iaki6mR4AgFJRFCqKIus2zd3PXsDsrrs/TtMt9YdoNtx928zuqT82b4M1d385\nsB1vNK7q6GhPdXv0h4dv1WjQswewOMxM7l5rFKU06lM3m5IemtkrMzuQ5O6+m55rSeq5+8thtUka\nBQDIZ2SPPtsL0aMHgLFNvUcPAJh/BD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0Bw\nBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0A\nBEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQ\nA0Bw5wp6M3s0bN7M2pXalpm1qjUAwOyNDHozuytpa6DcNrMfJf2UllmXJHfvpPm1zO0EANQ0Mujd\n/bGk7kC57e6fuvtOmr8j6XWa7krazNdEAMAk6o7Rr6RhmntpflnSQeX51cmaBQDI5Uqdldx9W5LM\n7JaZtVLZsrUKAJDN2D16M2ubWTlmvy+pKaknaSXVrqc6AOASqNOj70p6kaZXJT1L8xuSOpJupNoJ\nx8dHkh5Iaki6mR4AgFJRFCqKIus2zd3PXsDstqTHkr50929TrezR33D3f6ZaW/03gWY5tDOwHW80\nruroaE/StbEburTU0OHhWzUajbHXBYB5ZWZy94mGxkcGfS4EPQCML0fQc2csAARH0ANAcAQ9AARH\n0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANA\ncAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9\nAARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AAR3rqA3s0cD81tm1jKz9lk1AMDsjQx6M7sraasy\nvy5J7t5J82vDalNpLQBgbCOD3t0fS+pWSnckvU7TXUmbqdYbqAEALoE6Y/TLkg4q86un1AAAl0Dd\ni7GWtRUAgKm5UmOdnqSVNL0saT9Nl7XrlRoAYMbqBP0TSRuSOpKakp6p38MvazdS7YTj4yNJDyQ1\nJN1MDwBAqSgKFUWRdZvm7mcvYHZb0mNJX7r7t6nWVv+ia9Pdt0+rDWzHG42rOjrak3Rt7IYuLTV0\nePhWjUZj7HUBYF6Zmdx9ouHykUGfC0EPAOPLEfTcGQsAwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAc\nQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwRH0ABAcQQ8AwdX5hSlgKLPJfkr4on4bAVg0\nBD0yqxvW/N48MC0M3QBAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH\n0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARH0ANAcAQ9AARXK+jN7FH6t12pbZlZ\nq1oDAMxe3R5928x+lPSTJJnZuiS5eyfNr+VpHgBgUrWD3t0/dfedNH9H0us03ZW0OXHLAABZ1A36\nlTRMcy/NL0s6qDy/OlmzAAC5XKmzkrtvS5KZ3TKzVipbtlYBALIZO+jTxdYDd38qaV9SU1JP0kpa\n5Hqqn3B8fCTpgaSGpJvpAQAoFUWhoiiybtPcfbwV+j34F+7+i5k9lPSd+r35DXffTsM5z9z95cB6\n3mhc1dHRnqRrYzd0aamhw8O3ajQaY6+Li2FmksY7nypra9xzEVgEZiZ3n2jEZOwevbt30kcpJWmv\nDHQz20hvAr3BkM/lo48+mmh9ggTAIhq7R1/7hTL06N+9+030GC8vevRAfjl69NwZCwDBEfQAEBxB\nDwDBEfQAEBxBDwDBEfQAEBxBDwDBEfQAEBxBDwDBEfQAEBxBDwDBEfQAEBxBDwDB1fqFKQCTSV/z\nXRvf9IlxEPTAzNT/SmdgHAzdAEBwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0BwBD0ABEfQA0Bw\n3BmLhcZXEWAREPQAX0WA4Bi6AYDgCHoACI6hG1wak4yXM1YOnI6gH4GLdReJsXJgGhYq6OuHNgEE\nzAqdrcktVNDXC2zCGpg9OluTWLCgv3iMOwOYNYJ+6uiJRDbpsMIsXrduB2Jeh1DobBH0wIRm9Ua+\naK87iXlsc17Zgt7MtiT1JDXdfTvXdjG+WfUyAVxOWW6YMrN1SXL3Tppfy7FdTMJrPiZdFxjOzGo9\nMLlcd8bekfQ6TXclbWba7hwpZt2AKStm3YAzTR4ixayafgGKWTcgmVYHophCW2PJFfTLkg4q86uZ\ntjtHiuxbvFw9oGJK281l0hApLrCtF62YdQOmrJh1Ay69nBdj+RsrOy4iAZhcrqDvSVpJ09cl7Q9b\naGlJ+vjjP9XqdR4e/la/dQDeY9x78ViOz4mmi68b7r5tZvckPXP3lwPLcLUOAGpw94nenbP06N19\n18w2zKwlqTcY8mkZuhEAMAPZvo/e3bfdvbMIn6FPf7WU01tm1jKz9lk1YFrM7NHA/LnOyXk5T4fs\nXzs9HlZqYfavUs+WM9l/eCTaQRhkZpuSbqXpE/cPzPM9BWa2no5LiIAYFC0AJcnM7kraqsyf65yc\nl/N0yP61JD1PHcpmOk5rUoz9q9Sz5kzWoI92EE5RvdbwhU7eP3BH/YvT1dq8+Mrdn0panveAGJTa\n2U3t7qZ9mftz090fq3+elYbd0zLsnJyL83TI/jX1oa3dNP+F4uzf+6cq0xPnTO4efaiDMMjM1soA\nSP6ok/cPzOU9BWZ2W9L3kuTu37j7rs4fGvOi/BO5mfYv2hu1NPz8O2/t0ktDxOXw8LqkF+rvS/WT\nfnO7f9J0ciZr0C/AQVgZUotykXlD0mrq1ZZjg5ECYlfSz2Z2oA/tD7N/A6Kck6dKf3n9kI6rFGuf\ns+fMVH4cPOJBGPIuK/3+/oHyDe1c9xRcUnvlMbP+l9RJAY6dJJnZsqRXktqSts3sRvnU7Fo1FaPO\nyQjnqSS13P1+mg6zf9PKmWl9TfGogyDN30FomllT/R7eShq7faJ+T7ij/jDVM/WDo6zdSLV5sC/p\n5zTdk/RXxTl2Uj/g/+Xub8ysJ+m2Yu1fadQ5Oe/nqczsrrt/k6ZbivX/cCo5M41P3Qw7CM1yJ1KD\nqrW5OAju/jRdqHT1x8y80vt9f//AsNrMGj2e/+jDMVmW9D8FOXYld3+T/u2oH/Jzv3/p2sqGmf1D\nej9ENfKcnJfzdHD/0qdRHprZqzQMN9f/D4ccv6nkTJY7YyuN3pT0b/XHOFck3Xb3nfQxta4q31U/\nrIbZSsfkQP27nO9XaiGOXbr20JW0cta+zOv+AafJGvQAgMtnKhdjAQCXB0EPAMER9AAQHEEPAMER\n9AAQHEEPAMER9AAQHEEPAMH9H0bCkYE+cjbXAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x5d197d0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "scores = [float(e.split()[-1]) for e in entries]\n",
    "lengths = [float(e.split()[-2]) for e in entries]\n",
    "hist(lengths, 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
